Creating a logistic regression model with Theano

Based on Theano's tutorial.


In [2]:
# Importing libraries

import pandas as pd

import numpy as np

import theano
import theano.tensor as T

import matplotlib.pyplot as plt
%pylab inline 

import cPickle


Populating the interactive namespace from numpy and matplotlib

In [3]:
# Loading data

df = pd.read_csv('../data/train.csv')
df = df.astype(np.float64)

In [4]:
# Exploring data

print df.shape
df.iloc[0:10,0:10]


(42000, 785)
Out[4]:
label pixel0 pixel1 pixel2 pixel3 pixel4 pixel5 pixel6 pixel7 pixel8
0 1 0 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 0 0
2 1 0 0 0 0 0 0 0 0 0
3 4 0 0 0 0 0 0 0 0 0
4 0 0 0 0 0 0 0 0 0 0
5 0 0 0 0 0 0 0 0 0 0
6 7 0 0 0 0 0 0 0 0 0
7 3 0 0 0 0 0 0 0 0 0
8 5 0 0 0 0 0 0 0 0 0
9 3 0 0 0 0 0 0 0 0 0

In [5]:
# Showing some data

f, (ax1, ax2) = plt.subplots(ncols=2)
image_size = (28,28)
ax1.matshow( np.reshape(df.iloc[0,1:],image_size), cmap='gray_r')
ax2.matshow( np.reshape(df.iloc[7,1:],image_size), cmap='gray_r')
f.show()


/usr/lib/python2.7/dist-packages/matplotlib/figure.py:387: UserWarning: matplotlib is currently using a non-GUI backend, so cannot show the figure
  "matplotlib is currently using a non-GUI backend, "

In [6]:
# Organizing data

N = df.shape[0]
features = df.shape[1]-1
outputs = 10

def hot_vector_from_category_num(x):
    v = [0]*10
    v[int(x)] = 1
    return v

D = (
    df.iloc[:,1:],
    np.array(map(hot_vector_from_category_num, df.iloc[:,0]))
    )

In [7]:
# Making a logistic regression model with Theano

class LogisticRegressionModel(object):
    def __init__(self, inputs, outputs, learning_rate = 0.1):
        self.x = T.matrix('x') #Inputs
        self.y = T.matrix('y') #Output
        
        # Weights and bias     
        self.w = theano.shared(np.random.randn(inputs, outputs), name='w')
        self.b = theano.shared(np.zeros(outputs), name='b')

        
        # Theano expression graph

        ## Probability that target=1
        self.p_1 = ( 1.0 / (1.0 + T.exp(-T.dot(self.x,self.w)-self.b)) )

        ## The prediction thresholded
        #self.prediction = self.p_1 >= 0.5
        self.prediction = self.p_1 >= T.max(self.p_1)
        #self.prediction = self.p_1

        ## Cross-entropy loss function
        self.cross_entropy = -self.y*T.log(self.p_1) - (1-self.y)*T.log(1-self.p_1)

        ## The cost to minimize
        self.cost = self.cross_entropy.mean() + 0.01 * (self.w**2).sum()

        ## Computing the gradient of the cost
        self.gw, self.gb = T.grad(self.cost, [self.w, self.b])


        # Compiling the graph
        self.train = theano.function(
            inputs = [self.x, self.y],
            outputs = [self.prediction, self.cross_entropy],
            updates = ((self.w, self.w-learning_rate*self.gw), (self.b, self.b-learning_rate*self.gb))
        )
        
        self.predict = theano.function(
            inputs = [self.x],
            outputs = self.prediction
        )

In [8]:
# Creating a model instance and training it

print 'Creating model...'
my_model = LogisticRegressionModel(features, outputs)

print 'Training...'
training_steps = 1
for i in xrange(training_steps):
    pred, err = my_model.train(D[0], D[1])


Creating model...
Training...

In [9]:
# Checking percentage of error

pred_max = pred >= np.max(pred)
print np.sum(pred_max - D[1])/float(N)


4.24004761905

In [10]:
# Saving model

model_file = file('log_reg.model', 'wb')
cPickle.dump(my_model, model_file, protocol=cPickle.HIGHEST_PROTOCOL)
model_file.close()

In [11]:
# Loading a copy of the model

model_file = file('log_reg.model', 'rb')
another_model = cPickle.load(model_file)
model_file.close()

# Trying it
print another_model.predict(D[0])


[[0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]